/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_gemm.hpp"

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)

namespace arm_conv {
namespace depthwise {

void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
  const unsigned int n_channels,
  const uint8_t *const *const inptrs,
  const int8_t *const weights,
  const int32_t *const bias,
  const arm_gemm::Requantize32 &qp,
  const int32_t *const requant_muls,
  const int32_t *const requant_shifts,
  uint8_t *const *const outptrs
)
{
  // Parameter block passed by address into the inline assembly below.
  // The assembly loads every field through offsetof-style operands
  // (e.g. %[offsetof_Params_inptrs]), so the field NAMES, ORDER and
  // LAYOUT of this struct must not change independently of the asm.
  struct Params
  {
    long unsigned int n_channels;          // number of channels to process
    const int8_t *weights;                 // signed 8-bit 5x5 kernel weights
    const int32_t *bias;                   // per-channel bias accumulators
    const arm_gemm::Requantize32 *requant; // requantization offsets/clamp bounds
    const int32_t *const requant_muls;     // per-channel requantize multipliers
    const int32_t *const requant_shifts;   // per-channel requantize shifts
    uint8_t *const *const outptrs;         // 4 output row pointers (2x2 tile)
    const uint8_t *inptrs[36];             // input patch pointers, kernel order

    // Copies the raw input pointers into `inptrs`, permuting the first 14
    // entries into the order the assembly loop consumes them in; entries
    // 14..35 are copied through unchanged.
    Params(
      long unsigned int n_channels,
      const uint8_t *const *inptrs_raw,
      const int8_t *const weights,
      const int32_t *const bias,
      const arm_gemm::Requantize32 &qp,
      const int32_t *const requant_muls,
      const int32_t *const requant_shifts,
      uint8_t *const *outptrs
    ) : n_channels(n_channels), weights(weights), bias(bias),
        requant(&qp), requant_muls(requant_muls),
        requant_shifts(requant_shifts), outptrs(outptrs)
    {
      // Reordered prefix: maps raw patch indices to the asm's access order.
      inptrs[0] = inptrs_raw[0];
      inptrs[1] = inptrs_raw[1];
      inptrs[2] = inptrs_raw[6];
      inptrs[3] = inptrs_raw[7];
      inptrs[4] = inptrs_raw[2];
      inptrs[5] = inptrs_raw[8];
      inptrs[6] = inptrs_raw[3];
      inptrs[7] = inptrs_raw[4];
      inptrs[8] = inptrs_raw[11];
      inptrs[9] = inptrs_raw[12];
      inptrs[10] = inptrs_raw[9];
      inptrs[11] = inptrs_raw[10];
      inptrs[12] = inptrs_raw[5];
      inptrs[13] = inptrs_raw[13];
      // Identity tail: indices 14..35 pass through unpermuted.
      inptrs[14] = inptrs_raw[14];
      inptrs[15] = inptrs_raw[15];
      inptrs[16] = inptrs_raw[16];
      inptrs[17] = inptrs_raw[17];
      inptrs[18] = inptrs_raw[18];
      inptrs[19] = inptrs_raw[19];
      inptrs[20] = inptrs_raw[20];
      inptrs[21] = inptrs_raw[21];
      inptrs[22] = inptrs_raw[22];
      inptrs[23] = inptrs_raw[23];
      inptrs[24] = inptrs_raw[24];
      inptrs[25] = inptrs_raw[25];
      inptrs[26] = inptrs_raw[26];
      inptrs[27] = inptrs_raw[27];
      inptrs[28] = inptrs_raw[28];
      inptrs[29] = inptrs_raw[29];
      inptrs[30] = inptrs_raw[30];
      inptrs[31] = inptrs_raw[31];
      inptrs[32] = inptrs_raw[32];
      inptrs[33] = inptrs_raw[33];
      inptrs[34] = inptrs_raw[34];
      inptrs[35] = inptrs_raw[35];

    }
  };

  const Params params(n_channels, inptrs, weights, bias, qp,
                      requant_muls, requant_shifts, outptrs);

  __asm__ __volatile__(
    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
    "mov x10, #0x0\n"
    "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
    "mov x1, #0x0\n"
    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
    "add x25, %x[params], %[offsetof_Params_inptrs]\n"
    "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
    "lsr x19, x4, #0x3\n"
    "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
    "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
    "ld1r { v9.16b }, [x13]\n"
    "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
    "ld1r { v14.16b }, [x20]\n"
    "add x20, x22, %[offsetof_Requantize32_minval]\n"
    "ld1r { v10.4s }, [x8]\n"
    "add x8, x22, %[offsetof_Requantize32_maxval]\n"
    "ld1r { v11.4s }, [x20]\n"
    "ld1r { v13.4s }, [x8]\n"
    "ldp x17, x16, [x21, #0x0]\n"
    "ldp x6, x8, [x21, #0x10]\n"
    "cbz x19, 3f\n"
    "subs x19, x19, #0x1\n"
    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
    "ldr q15, [x12, #0x0]\n"
    "mov v16.16b, v15.16b\n"
    "ldr q18, [x12, #0x10]\n"
    "add x12, x12, #0x20\n"
    "mov v7.16b, v15.16b\n"
    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
    "mov v8.16b, v15.16b\n"
    "ldr d0, [x3, #0x0]\n"
    "ldr d1, [x3, #0x8]\n"
    "mov v21.16b, v18.16b\n"
    "ldr d2, [x3, #0x10]\n"
    "mov v17.16b, v18.16b\n"
    "ldr d3, [x3, #0x18]\n"
    "mov v5.16b, v18.16b\n"
    "ldr d4, [x3, #0x20]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "ldp x28, x27, [x25, #0x0]\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "ldp x26, x13, [x25, #0x10]\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "ldp x24, x23, [x25, #0x20]\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "ldp x22, x21, [x25, #0x30]\n"
    "ldp x20, x0, [x25, #0x40]\n"
    "ldr d31, [x28, x10]\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr d30, [x27, x10]\n"
    "ldr d29, [x26, x10]\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ldr d28, [x13, x10]\n"
    "usubl v29.8h, v29.8b, v9.8b\n"
    "ldr d27, [x24, x10]\n"
    "ldr d23, [x23, x10]\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ldr d25, [x22, x10]\n"
    "ldr d24, [x21, x10]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ldr d26, [x20, x10]\n"
    "ldr d22, [x0, x10]\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "usubl v22.8h, v22.8b, v9.8b\n"
    "beq 2f\n"
    "1:"  // Loop
    "smlal v15.4s, v31.4h, v0.4h\n"
    "ldr x20, [x25, #0x50]\n"
    "subs x19, x19, #0x1\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "ldr x28, [x25, #0x58]\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "ldr x0, [x25, #0x60]\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "ldr d31, [x20, x10]\n"
    "smlal v7.4s, v29.4h, v0.4h\n"
    "ldr x7, [x25, #0x68]\n"
    "smlal2 v17.4s, v29.8h, v0.8h\n"
    "ldr x26, [x25, #0x70]\n"
    "smlal v8.4s, v28.4h, v0.4h\n"
    "ldr x23, [x25, #0x78]\n"
    "smlal2 v5.4s, v28.8h, v0.8h\n"
    "ldr d0, [x3, #0x28]\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "ldr x20, [x25, #0x80]\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "ldr d30, [x28, x10]\n"
    "smlal v16.4s, v27.4h, v1.4h\n"
    "ldr x22, [x25, #0x88]\n"
    "smlal2 v21.4s, v27.8h, v1.8h\n"
    "ldr x13, [x25, #0x90]\n"
    "smlal v7.4s, v28.4h, v1.4h\n"
    "ldr x21, [x25, #0x98]\n"
    "smlal2 v17.4s, v28.8h, v1.8h\n"
    "ldr x14, [x25, #0xa0]\n"
    "smlal v8.4s, v23.4h, v1.4h\n"
    "ldr x11, [x25, #0xa8]\n"
    "smlal2 v5.4s, v23.8h, v1.8h\n"
    "ldr d1, [x3, #0x30]\n"
    "smlal v15.4s, v27.4h, v2.4h\n"
    "ldr x24, [x25, #0xb0]\n"
    "smlal2 v18.4s, v27.8h, v2.8h\n"
    "ldr d27, [x0, x10]\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "ldr x0, [x25, #0xb8]\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "ldr x15, [x25, #0xc0]\n"
    "smlal v7.4s, v23.4h, v2.4h\n"
    "ldr x9, [x25, #0xc8]\n"
    "smlal2 v17.4s, v23.8h, v2.8h\n"
    "ldr x27, [x25, #0xd0]\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr x28, [x25, #0xd8]\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "ldr q6, [x2, #0x0]\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "ldr d25, [x7, x10]\n"
    "smlal v8.4s, v31.4h, v2.4h\n"
    "ldr x12, [x25, #0xe0]\n"
    "smlal2 v5.4s, v31.8h, v2.8h\n"
    "ldr d2, [x3, #0x38]\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "ldr q19, [x5, #0x0]\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "ldr q20, [x2, #0x10]\n"
    "add x2, x2, #0x20\n"
    "smlal v7.4s, v31.4h, v3.4h\n"
    "ldr q12, [x5, #0x10]\n"
    "add x5, x5, #0x20\n"
    "smlal2 v17.4s, v31.8h, v3.8h\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "ldr d24, [x26, x10]\n"
    "smlal v8.4s, v30.4h, v3.4h\n"
    "ldr x7, [x25, #0xe8]\n"
    "smlal2 v5.4s, v30.8h, v3.8h\n"
    "ldr d3, [x3, #0x40]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "smlal v7.4s, v30.4h, v4.4h\n"
    "smlal2 v17.4s, v30.8h, v4.8h\n"
    "smlal v16.4s, v27.4h, v4.4h\n"
    "smlal2 v21.4s, v27.8h, v4.8h\n"
    "ldr d27, [x23, x10]\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "ldr x26, [x25, #0xf0]\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ldr d4, [x3, #0x48]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v29.4h, v0.4h\n"
    "smlal2 v18.4s, v29.8h, v0.8h\n"
    "smlal v16.4s, v28.4h, v0.4h\n"
    "smlal2 v21.4s, v28.8h, v0.8h\n"
    "smlal v7.4s, v22.4h, v0.4h\n"
    "smlal2 v17.4s, v22.8h, v0.8h\n"
    "smlal v8.4s, v25.4h, v0.4h\n"
    "smlal2 v5.4s, v25.8h, v0.8h\n"
    "ldr d0, [x3, #0x50]\n"
    "smlal v15.4s, v28.4h, v1.4h\n"
    "smlal2 v18.4s, v28.8h, v1.8h\n"
    "ldr d28, [x22, x10]\n"
    "smlal v16.4s, v23.4h, v1.4h\n"
    "ldr x23, [x25, #0xf8]\n"
    "smlal2 v21.4s, v23.8h, v1.8h\n"
    "smlal v7.4s, v25.4h, v1.4h\n"
    "smlal2 v17.4s, v25.8h, v1.8h\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "smlal v8.4s, v24.4h, v1.4h\n"
    "smlal2 v5.4s, v24.8h, v1.8h\n"
    "ldr d1, [x3, #0x58]\n"
    "smlal v15.4s, v23.4h, v2.4h\n"
    "smlal2 v18.4s, v23.8h, v2.8h\n"
    "ldr d23, [x20, x10]\n"
    "smlal v16.4s, v31.4h, v2.4h\n"
    "ldr x22, [x25, #0x100]\n"
    "smlal2 v21.4s, v31.8h, v2.8h\n"
    "smlal v7.4s, v24.4h, v2.4h\n"
    "smlal2 v17.4s, v24.8h, v2.8h\n"
    "smlal v8.4s, v27.4h, v2.4h\n"
    "smlal2 v5.4s, v27.8h, v2.8h\n"
    "ldr d2, [x3, #0x60]\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v31.4h, v3.4h\n"
    "smlal2 v18.4s, v31.8h, v3.8h\n"
    "ldr d31, [x13, x10]\n"
    "smlal v16.4s, v30.4h, v3.4h\n"
    "ldr x20, [x25, #0x108]\n"
    "smlal2 v21.4s, v30.8h, v3.8h\n"
    "smlal v7.4s, v27.4h, v3.4h\n"
    "smlal2 v17.4s, v27.8h, v3.8h\n"
    "smlal v8.4s, v23.4h, v3.4h\n"
    "smlal2 v5.4s, v23.8h, v3.8h\n"
    "ldr d3, [x3, #0x68]\n"
    "smlal v15.4s, v30.4h, v4.4h\n"
    "smlal2 v18.4s, v30.8h, v4.8h\n"
    "ldr d30, [x21, x10]\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "ldr x13, [x25, #0x110]\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "ldr d26, [x14, x10]\n"
    "smlal v7.4s, v23.4h, v4.4h\n"
    "ldr x21, [x25, #0x118]\n"
    "smlal2 v17.4s, v23.8h, v4.8h\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "smlal v8.4s, v28.4h, v4.4h\n"
    "smlal2 v5.4s, v28.8h, v4.8h\n"
    "ldr d4, [x3, #0x70]\n"
    "smlal v15.4s, v22.4h, v0.4h\n"
    "smlal2 v18.4s, v22.8h, v0.8h\n"
    "ldr d22, [x0, x10]\n"
    "smlal v16.4s, v25.4h, v0.4h\n"
    "smlal2 v21.4s, v25.8h, v0.8h\n"
    "smlal v7.4s, v31.4h, v0.4h\n"
    "smlal2 v17.4s, v31.8h, v0.8h\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "smlal v8.4s, v30.4h, v0.4h\n"
    "smlal2 v5.4s, v30.8h, v0.8h\n"
    "ldr d0, [x3, #0x78]\n"
    "smlal v15.4s, v25.4h, v1.4h\n"
    "smlal2 v18.4s, v25.8h, v1.8h\n"
    "ldr d25, [x11, x10]\n"
    "smlal v16.4s, v24.4h, v1.4h\n"
    "smlal2 v21.4s, v24.8h, v1.8h\n"
    "smlal v7.4s, v30.4h, v1.4h\n"
    "smlal2 v17.4s, v30.8h, v1.8h\n"
    "smlal v8.4s, v26.4h, v1.4h\n"
    "smlal2 v5.4s, v26.8h, v1.8h\n"
    "ldr d1, [x3, #0x80]\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v2.4h\n"
    "smlal2 v18.4s, v24.8h, v2.8h\n"
    "ldr d24, [x24, x10]\n"
    "smlal v16.4s, v27.4h, v2.4h\n"
    "smlal2 v21.4s, v27.8h, v2.8h\n"
    "smlal v7.4s, v26.4h, v2.4h\n"
    "smlal2 v17.4s, v26.8h, v2.8h\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ldr d2, [x3, #0x88]\n"
    "smlal v15.4s, v27.4h, v3.4h\n"
    "smlal2 v18.4s, v27.8h, v3.8h\n"
    "ldr d27, [x15, x10]\n"
    "smlal v16.4s, v23.4h, v3.4h\n"
    "smlal2 v21.4s, v23.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "usubl v22.8h, v22.8b, v9.8b\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "ldr d3, [x3, #0x90]\n"
    "smlal v15.4s, v23.4h, v4.4h\n"
    "smlal2 v18.4s, v23.8h, v4.8h\n"
    "ldr d23, [x9, x10]\n"
    "smlal v16.4s, v28.4h, v4.4h\n"
    "smlal2 v21.4s, v28.8h, v4.8h\n"
    "ldr d28, [x12, x10]\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "smlal v8.4s, v22.4h, v4.4h\n"
    "smlal2 v5.4s, v22.8h, v4.8h\n"
    "ldr d4, [x3, #0x98]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "smlal v15.4s, v31.4h, v0.4h\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "ldr d31, [x27, x10]\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "smlal v7.4s, v27.4h, v0.4h\n"
    "smlal2 v17.4s, v27.8h, v0.8h\n"
    "smlal v8.4s, v23.4h, v0.4h\n"
    "smlal2 v5.4s, v23.8h, v0.8h\n"
    "ldr d0, [x3, #0xa0]\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "ldr d30, [x28, x10]\n"
    "smlal v16.4s, v26.4h, v1.4h\n"
    "smlal2 v21.4s, v26.8h, v1.8h\n"
    "smlal v7.4s, v23.4h, v1.4h\n"
    "smlal2 v17.4s, v23.8h, v1.8h\n"
    "smlal v8.4s, v31.4h, v1.4h\n"
    "smlal2 v5.4s, v31.8h, v1.8h\n"
    "ldr d1, [x3, #0xa8]\n"
    "smlal v15.4s, v26.4h, v2.4h\n"
    "smlal2 v18.4s, v26.8h, v2.8h\n"
    "ldr d26, [x7, x10]\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "smlal v7.4s, v31.4h, v2.4h\n"
    "smlal2 v17.4s, v31.8h, v2.8h\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "smlal v8.4s, v30.4h, v2.4h\n"
    "smlal2 v5.4s, v30.8h, v2.8h\n"
    "ldr d2, [x3, #0xb0]\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "ldr d25, [x26, x10]\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "smlal v7.4s, v30.4h, v3.4h\n"
    "smlal2 v17.4s, v30.8h, v3.8h\n"
    "smlal v8.4s, v28.4h, v3.4h\n"
    "smlal2 v5.4s, v28.8h, v3.8h\n"
    "ldr d3, [x3, #0xb8]\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "ldr d24, [x23, x10]\n"
    "smlal v16.4s, v22.4h, v4.4h\n"
    "smlal2 v21.4s, v22.8h, v4.8h\n"
    "smlal v7.4s, v28.4h, v4.4h\n"
    "smlal2 v17.4s, v28.8h, v4.8h\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ldr d4, [x3, #0xc0]\n"
    "add x3, x3, #0xc8\n"
    "smlal v15.4s, v27.4h, v0.4h\n"
    "smlal2 v18.4s, v27.8h, v0.8h\n"
    "ldr d27, [x22, x10]\n"
    "smlal v16.4s, v23.4h, v0.4h\n"
    "smlal2 v21.4s, v23.8h, v0.8h\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v7.4s, v25.4h, v0.4h\n"
    "smlal2 v17.4s, v25.8h, v0.8h\n"
    "ldr d25, [x20, x10]\n"
    "smlal v8.4s, v24.4h, v0.4h\n"
    "smlal2 v5.4s, v24.8h, v0.8h\n"
    "smlal v15.4s, v23.4h, v1.4h\n"
    "smlal2 v18.4s, v23.8h, v1.8h\n"
    "smlal v16.4s, v31.4h, v1.4h\n"
    "smlal2 v21.4s, v31.8h, v1.8h\n"
    "smlal v7.4s, v24.4h, v1.4h\n"
    "smlal2 v17.4s, v24.8h, v1.8h\n"
    "ldr d24, [x13, x10]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "smlal v8.4s, v27.4h, v1.4h\n"
    "smlal2 v5.4s, v27.8h, v1.8h\n"
    "smlal v15.4s, v31.4h, v2.4h\n"
    "smlal2 v18.4s, v31.8h, v2.8h\n"
    "smlal v16.4s, v30.4h, v2.4h\n"
    "smlal2 v21.4s, v30.8h, v2.8h\n"
    "smlal v7.4s, v27.4h, v2.4h\n"
    "smlal2 v17.4s, v27.8h, v2.8h\n"
    "ldr d27, [x21, x10]\n"
    "add x10, x10, #0x8\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v3.4h\n"
    "smlal2 v18.4s, v30.8h, v3.8h\n"
    "smlal v16.4s, v28.4h, v3.4h\n"
    "smlal2 v21.4s, v28.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "smlal v15.4s, v28.4h, v4.4h\n"
    "smlal2 v18.4s, v28.8h, v4.8h\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
    "smlal v8.4s, v27.4h, v4.4h\n"
    "smlal2 v5.4s, v27.8h, v4.8h\n"
    "and v28.16b, v15.16b, v19.16b\n"
    "and v26.16b, v18.16b, v12.16b\n"
    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
    "sshr v28.4s, v28.4s, #0x1f\n"
    "sshr v26.4s, v26.4s, #0x1f\n"
    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
    "sqadd v15.4s, v15.4s, v28.4s\n"
    "sqadd v18.4s, v18.4s, v26.4s\n"
    "and v29.16b, v16.16b, v19.16b\n"
    "and v4.16b, v21.16b, v12.16b\n"
    "srshl v15.4s, v15.4s, v19.4s\n"
    "srshl v18.4s, v18.4s, v12.4s\n"
    "sshr v29.4s, v29.4s, #0x1f\n"
    "sshr v4.4s, v4.4s, #0x1f\n"
    "add v15.4s, v15.4s, v10.4s\n"
    "add v18.4s, v18.4s, v10.4s\n"
    "sqadd v16.4s, v16.4s, v29.4s\n"
    "smin v15.4s, v15.4s, v13.4s\n"
    "smin v18.4s, v18.4s, v13.4s\n"
    "sqadd v21.4s, v21.4s, v4.4s\n"
    "smax v15.4s, v15.4s, v11.4s\n"
    "smax v18.4s, v18.4s, v11.4s\n"
    "srshl v16.4s, v16.4s, v19.4s\n"
    "srshl v21.4s, v21.4s, v12.4s\n"
    "uzp1 v15.16b, v15.16b, v18.16b\n"
    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
    "uzp1 v15.16b, v15.16b, v15.16b\n"
    "str d15, [x17, x1]\n"
    "add v16.4s, v16.4s, v10.4s\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "and v25.16b, v7.16b, v19.16b\n"
    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
    "smin v16.4s, v16.4s, v13.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "sshr v25.4s, v25.4s, #0x1f\n"
    "smax v16.4s, v16.4s, v11.4s\n"
    "smax v21.4s, v21.4s, v11.4s\n"
    "sqadd v7.4s, v7.4s, v25.4s\n"
    "and v31.16b, v17.16b, v12.16b\n"
    "uzp1 v16.16b, v16.16b, v21.16b\n"
    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
    "uzp1 v16.16b, v16.16b, v16.16b\n"
    "str d16, [x16, x1]\n"
    "srshl v7.4s, v7.4s, v19.4s\n"
    "sshr v31.4s, v31.4s, #0x1f\n"
    "and v24.16b, v8.16b, v19.16b\n"
    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
    "sqadd v17.4s, v17.4s, v31.4s\n"
    "add v7.4s, v7.4s, v10.4s\n"
    "sshr v24.4s, v24.4s, #0x1f\n"
    "and v1.16b, v5.16b, v12.16b\n"
    "smin v7.4s, v7.4s, v13.4s\n"
    "srshl v17.4s, v17.4s, v12.4s\n"
    "sqadd v8.4s, v8.4s, v24.4s\n"
    "smax v7.4s, v7.4s, v11.4s\n"
    "sshr v1.4s, v1.4s, #0x1f\n"
    "add v17.4s, v17.4s, v10.4s\n"
    "srshl v8.4s, v8.4s, v19.4s\n"
    "sqadd v5.4s, v5.4s, v1.4s\n"
    "smin v17.4s, v17.4s, v13.4s\n"
    "add v8.4s, v8.4s, v10.4s\n"
    "smax v17.4s, v17.4s, v11.4s\n"
    "srshl v5.4s, v5.4s, v12.4s\n"
    "smin v8.4s, v8.4s, v13.4s\n"
    "uzp1 v7.16b, v7.16b, v17.16b\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "uzp1 v7.16b, v7.16b, v7.16b\n"
    "str d7, [x6, x1]\n"
    "smax v8.4s, v8.4s, v11.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "smax v5.4s, v5.4s, v11.4s\n"
    "uzp1 v8.16b, v8.16b, v5.16b\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "str d8, [x8, x1]\n"
    "add x1, x1, #0x8\n"
    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
    "ldr q15, [x12, #0x0]\n"
    "mov v16.16b, v15.16b\n"
    "ldr q18, [x12, #0x10]\n"
    "add x12, x12, #0x20\n"
    "mov v7.16b, v15.16b\n"
    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
    "mov v8.16b, v15.16b\n"
    "ldr d0, [x3, #0x0]\n"
    "ldr d1, [x3, #0x8]\n"
    "mov v21.16b, v18.16b\n"
    "ldr d2, [x3, #0x10]\n"
    "mov v17.16b, v18.16b\n"
    "ldr d3, [x3, #0x18]\n"
    "mov v5.16b, v18.16b\n"
    "ldr d4, [x3, #0x20]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "ldp x28, x27, [x25, #0x0]\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "ldp x26, x13, [x25, #0x10]\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "ldp x24, x23, [x25, #0x20]\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "ldp x22, x21, [x25, #0x30]\n"
    "ldp x20, x0, [x25, #0x40]\n"
    "ldr d31, [x28, x10]\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr d30, [x27, x10]\n"
    "ldr d29, [x26, x10]\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ldr d28, [x13, x10]\n"
    "usubl v29.8h, v29.8b, v9.8b\n"
    "ldr d27, [x24, x10]\n"
    "ldr d23, [x23, x10]\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ldr d25, [x22, x10]\n"
    "ldr d24, [x21, x10]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ldr d26, [x20, x10]\n"
    "ldr d22, [x0, x10]\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "usubl v22.8h, v22.8b, v9.8b\n"
    "bgt 1b\n"
    "2:"  // Tail
    "smlal v15.4s, v31.4h, v0.4h\n"
    "ldr x20, [x25, #0x50]\n"
    "tst x4, #0x7\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "ldr x28, [x25, #0x58]\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "ldr x0, [x25, #0x60]\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "ldr d31, [x20, x10]\n"
    "smlal v7.4s, v29.4h, v0.4h\n"
    "ldr x7, [x25, #0x68]\n"
    "smlal2 v17.4s, v29.8h, v0.8h\n"
    "ldr x26, [x25, #0x70]\n"
    "smlal v8.4s, v28.4h, v0.4h\n"
    "ldr x23, [x25, #0x78]\n"
    "smlal2 v5.4s, v28.8h, v0.8h\n"
    "ldr d0, [x3, #0x28]\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "ldr x20, [x25, #0x80]\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "ldr d30, [x28, x10]\n"
    "smlal v16.4s, v27.4h, v1.4h\n"
    "ldr x22, [x25, #0x88]\n"
    "smlal2 v21.4s, v27.8h, v1.8h\n"
    "ldr x13, [x25, #0x90]\n"
    "smlal v7.4s, v28.4h, v1.4h\n"
    "ldr x21, [x25, #0x98]\n"
    "smlal2 v17.4s, v28.8h, v1.8h\n"
    "ldr x14, [x25, #0xa0]\n"
    "smlal v8.4s, v23.4h, v1.4h\n"
    "ldr x11, [x25, #0xa8]\n"
    "smlal2 v5.4s, v23.8h, v1.8h\n"
    "ldr d1, [x3, #0x30]\n"
    "smlal v15.4s, v27.4h, v2.4h\n"
    "ldr x24, [x25, #0xb0]\n"
    "smlal2 v18.4s, v27.8h, v2.8h\n"
    "ldr d27, [x0, x10]\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "ldr x0, [x25, #0xb8]\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "ldr x15, [x25, #0xc0]\n"
    "smlal v7.4s, v23.4h, v2.4h\n"
    "ldr x9, [x25, #0xc8]\n"
    "smlal2 v17.4s, v23.8h, v2.8h\n"
    "ldr x27, [x25, #0xd0]\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr x28, [x25, #0xd8]\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "ldr x12, [x25, #0xe0]\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "ldr d25, [x7, x10]\n"
    "smlal v8.4s, v31.4h, v2.4h\n"
    "ldr x7, [x25, #0xe8]\n"
    "smlal2 v5.4s, v31.8h, v2.8h\n"
    "ldr d2, [x3, #0x38]\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "ldr q6, [x2, #0x0]\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "ldr q19, [x5, #0x0]\n"
    "smlal v7.4s, v31.4h, v3.4h\n"
    "ldr q20, [x2, #0x10]\n"
    "add x2, x2, #0x20\n"
    "smlal2 v17.4s, v31.8h, v3.8h\n"
    "ldr q12, [x5, #0x10]\n"
    "add x5, x5, #0x20\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "ldr d24, [x26, x10]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ldr x26, [x25, #0xf0]\n"
    "smlal v8.4s, v30.4h, v3.4h\n"
    "smlal2 v5.4s, v30.8h, v3.8h\n"
    "ldr d3, [x3, #0x40]\n"
    "smlal v16.4s, v27.4h, v4.4h\n"
    "smlal2 v21.4s, v27.8h, v4.8h\n"
    "ldr d27, [x23, x10]\n"
    "smlal v7.4s, v30.4h, v4.4h\n"
    "ldr x23, [x25, #0xf8]\n"
    "smlal2 v17.4s, v30.8h, v4.8h\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ldr d4, [x3, #0x48]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v29.4h, v0.4h\n"
    "smlal2 v18.4s, v29.8h, v0.8h\n"
    "smlal v16.4s, v28.4h, v0.4h\n"
    "smlal2 v21.4s, v28.8h, v0.8h\n"
    "smlal v7.4s, v22.4h, v0.4h\n"
    "smlal2 v17.4s, v22.8h, v0.8h\n"
    "smlal v8.4s, v25.4h, v0.4h\n"
    "smlal2 v5.4s, v25.8h, v0.8h\n"
    "ldr d0, [x3, #0x50]\n"
    "smlal v15.4s, v28.4h, v1.4h\n"
    "smlal2 v18.4s, v28.8h, v1.8h\n"
    "ldr d28, [x22, x10]\n"
    "smlal v16.4s, v23.4h, v1.4h\n"
    "ldr x22, [x25, #0x100]\n"
    "smlal2 v21.4s, v23.8h, v1.8h\n"
    "smlal v7.4s, v25.4h, v1.4h\n"
    "smlal2 v17.4s, v25.8h, v1.8h\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "smlal v8.4s, v24.4h, v1.4h\n"
    "smlal2 v5.4s, v24.8h, v1.8h\n"
    "ldr d1, [x3, #0x58]\n"
    "smlal v15.4s, v23.4h, v2.4h\n"
    "smlal2 v18.4s, v23.8h, v2.8h\n"
    "ldr d23, [x20, x10]\n"
    "smlal v16.4s, v31.4h, v2.4h\n"
    "ldr x20, [x25, #0x108]\n"
    "smlal2 v21.4s, v31.8h, v2.8h\n"
    "smlal v7.4s, v24.4h, v2.4h\n"
    "smlal2 v17.4s, v24.8h, v2.8h\n"
    "smlal v8.4s, v27.4h, v2.4h\n"
    "smlal2 v5.4s, v27.8h, v2.8h\n"
    "ldr d2, [x3, #0x60]\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v31.4h, v3.4h\n"
    "smlal2 v18.4s, v31.8h, v3.8h\n"
    "ldr d31, [x13, x10]\n"
    "smlal v16.4s, v30.4h, v3.4h\n"
    "ldr x13, [x25, #0x110]\n"
    "smlal2 v21.4s, v30.8h, v3.8h\n"
    "smlal v7.4s, v27.4h, v3.4h\n"
    "smlal2 v17.4s, v27.8h, v3.8h\n"
    "smlal v8.4s, v23.4h, v3.4h\n"
    "smlal2 v5.4s, v23.8h, v3.8h\n"
    "ldr d3, [x3, #0x68]\n"
    "smlal v15.4s, v30.4h, v4.4h\n"
    "smlal2 v18.4s, v30.8h, v4.8h\n"
    "ldr d30, [x21, x10]\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "ldr x21, [x25, #0x118]\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "ldr d26, [x14, x10]\n"
    "smlal v7.4s, v23.4h, v4.4h\n"
    "smlal2 v17.4s, v23.8h, v4.8h\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "smlal v8.4s, v28.4h, v4.4h\n"
    "smlal2 v5.4s, v28.8h, v4.8h\n"
    "ldr d4, [x3, #0x70]\n"
    "smlal v15.4s, v22.4h, v0.4h\n"
    "smlal2 v18.4s, v22.8h, v0.8h\n"
    "ldr d22, [x0, x10]\n"
    "smlal v16.4s, v25.4h, v0.4h\n"
    "smlal2 v21.4s, v25.8h, v0.8h\n"
    "smlal v7.4s, v31.4h, v0.4h\n"
    "smlal2 v17.4s, v31.8h, v0.8h\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "smlal v8.4s, v30.4h, v0.4h\n"
    "smlal2 v5.4s, v30.8h, v0.8h\n"
    "ldr d0, [x3, #0x78]\n"
    "smlal v15.4s, v25.4h, v1.4h\n"
    "smlal2 v18.4s, v25.8h, v1.8h\n"
    "ldr d25, [x11, x10]\n"
    "smlal v16.4s, v24.4h, v1.4h\n"
    "smlal2 v21.4s, v24.8h, v1.8h\n"
    "smlal v7.4s, v30.4h, v1.4h\n"
    "smlal2 v17.4s, v30.8h, v1.8h\n"
    "smlal v8.4s, v26.4h, v1.4h\n"
    "smlal2 v5.4s, v26.8h, v1.8h\n"
    "ldr d1, [x3, #0x80]\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v2.4h\n"
    "smlal2 v18.4s, v24.8h, v2.8h\n"
    "ldr d24, [x24, x10]\n"
    "smlal v16.4s, v27.4h, v2.4h\n"
    "smlal2 v21.4s, v27.8h, v2.8h\n"
    "smlal v7.4s, v26.4h, v2.4h\n"
    "smlal2 v17.4s, v26.8h, v2.8h\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ldr d2, [x3, #0x88]\n"
    "smlal v15.4s, v27.4h, v3.4h\n"
    "smlal2 v18.4s, v27.8h, v3.8h\n"
    "ldr d27, [x15, x10]\n"
    "smlal v16.4s, v23.4h, v3.4h\n"
    "smlal2 v21.4s, v23.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "usubl v22.8h, v22.8b, v9.8b\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "ldr d3, [x3, #0x90]\n"
    "smlal v15.4s, v23.4h, v4.4h\n"
    "smlal2 v18.4s, v23.8h, v4.8h\n"
    "ldr d23, [x9, x10]\n"
    "smlal v16.4s, v28.4h, v4.4h\n"
    "smlal2 v21.4s, v28.8h, v4.8h\n"
    "ldr d28, [x12, x10]\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "smlal v8.4s, v22.4h, v4.4h\n"
    "smlal2 v5.4s, v22.8h, v4.8h\n"
    "ldr d4, [x3, #0x98]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "smlal v15.4s, v31.4h, v0.4h\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "ldr d31, [x27, x10]\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "smlal v7.4s, v27.4h, v0.4h\n"
    "smlal2 v17.4s, v27.8h, v0.8h\n"
    "smlal v8.4s, v23.4h, v0.4h\n"
    "smlal2 v5.4s, v23.8h, v0.8h\n"
    "ldr d0, [x3, #0xa0]\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "ldr d30, [x28, x10]\n"
    "smlal v16.4s, v26.4h, v1.4h\n"
    "smlal2 v21.4s, v26.8h, v1.8h\n"
    "smlal v7.4s, v23.4h, v1.4h\n"
    "smlal2 v17.4s, v23.8h, v1.8h\n"
    "smlal v8.4s, v31.4h, v1.4h\n"
    "smlal2 v5.4s, v31.8h, v1.8h\n"
    "ldr d1, [x3, #0xa8]\n"
    "smlal v15.4s, v26.4h, v2.4h\n"
    "smlal2 v18.4s, v26.8h, v2.8h\n"
    "ldr d26, [x7, x10]\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "smlal v7.4s, v31.4h, v2.4h\n"
    "smlal2 v17.4s, v31.8h, v2.8h\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "smlal v8.4s, v30.4h, v2.4h\n"
    "smlal2 v5.4s, v30.8h, v2.8h\n"
    "ldr d2, [x3, #0xb0]\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "ldr d25, [x26, x10]\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "smlal v7.4s, v30.4h, v3.4h\n"
    "smlal2 v17.4s, v30.8h, v3.8h\n"
    "smlal v8.4s, v28.4h, v3.4h\n"
    "smlal2 v5.4s, v28.8h, v3.8h\n"
    "ldr d3, [x3, #0xb8]\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "ldr d24, [x23, x10]\n"
    "smlal v16.4s, v22.4h, v4.4h\n"
    "smlal2 v21.4s, v22.8h, v4.8h\n"
    "smlal v7.4s, v28.4h, v4.4h\n"
    "smlal2 v17.4s, v28.8h, v4.8h\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ldr d4, [x3, #0xc0]\n"
    "smlal v15.4s, v27.4h, v0.4h\n"
    "smlal2 v18.4s, v27.8h, v0.8h\n"
    "ldr d27, [x22, x10]\n"
    "smlal v16.4s, v23.4h, v0.4h\n"
    "smlal2 v21.4s, v23.8h, v0.8h\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v7.4s, v25.4h, v0.4h\n"
    "smlal2 v17.4s, v25.8h, v0.8h\n"
    "ldr d25, [x20, x10]\n"
    "smlal v8.4s, v24.4h, v0.4h\n"
    "smlal2 v5.4s, v24.8h, v0.8h\n"
    "smlal v15.4s, v23.4h, v1.4h\n"
    "smlal2 v18.4s, v23.8h, v1.8h\n"
    "smlal v16.4s, v31.4h, v1.4h\n"
    "smlal2 v21.4s, v31.8h, v1.8h\n"
    "smlal v7.4s, v24.4h, v1.4h\n"
    "smlal2 v17.4s, v24.8h, v1.8h\n"
    "ldr d24, [x13, x10]\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "smlal v8.4s, v27.4h, v1.4h\n"
    "smlal2 v5.4s, v27.8h, v1.8h\n"
    "smlal v15.4s, v31.4h, v2.4h\n"
    "smlal2 v18.4s, v31.8h, v2.8h\n"
    "smlal v16.4s, v30.4h, v2.4h\n"
    "smlal2 v21.4s, v30.8h, v2.8h\n"
    "smlal v7.4s, v27.4h, v2.4h\n"
    "smlal2 v17.4s, v27.8h, v2.8h\n"
    "ldr d27, [x21, x10]\n"
    "add x10, x10, #0x8\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v3.4h\n"
    "smlal2 v18.4s, v30.8h, v3.8h\n"
    "smlal v16.4s, v28.4h, v3.4h\n"
    "smlal2 v21.4s, v28.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "smlal v15.4s, v28.4h, v4.4h\n"
    "smlal2 v18.4s, v28.8h, v4.8h\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
    "smlal v8.4s, v27.4h, v4.4h\n"
    "smlal2 v5.4s, v27.8h, v4.8h\n"
    "and v28.16b, v15.16b, v19.16b\n"
    "and v26.16b, v18.16b, v12.16b\n"
    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
    "sshr v28.4s, v28.4s, #0x1f\n"
    "sshr v26.4s, v26.4s, #0x1f\n"
    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
    "sqadd v15.4s, v15.4s, v28.4s\n"
    "sqadd v18.4s, v18.4s, v26.4s\n"
    "and v29.16b, v16.16b, v19.16b\n"
    "and v4.16b, v21.16b, v12.16b\n"
    "srshl v15.4s, v15.4s, v19.4s\n"
    "srshl v18.4s, v18.4s, v12.4s\n"
    "sshr v29.4s, v29.4s, #0x1f\n"
    "sshr v4.4s, v4.4s, #0x1f\n"
    "add v15.4s, v15.4s, v10.4s\n"
    "add v18.4s, v18.4s, v10.4s\n"
    "sqadd v16.4s, v16.4s, v29.4s\n"
    "smin v15.4s, v15.4s, v13.4s\n"
    "smin v18.4s, v18.4s, v13.4s\n"
    "sqadd v21.4s, v21.4s, v4.4s\n"
    "smax v15.4s, v15.4s, v11.4s\n"
    "smax v18.4s, v18.4s, v11.4s\n"
    "srshl v16.4s, v16.4s, v19.4s\n"
    "srshl v21.4s, v21.4s, v12.4s\n"
    "uzp1 v15.16b, v15.16b, v18.16b\n"
    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
    "uzp1 v15.16b, v15.16b, v15.16b\n"
    "str d15, [x17, x1]\n"
    "add v16.4s, v16.4s, v10.4s\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "and v25.16b, v7.16b, v19.16b\n"
    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
    "smin v16.4s, v16.4s, v13.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "sshr v25.4s, v25.4s, #0x1f\n"
    "smax v16.4s, v16.4s, v11.4s\n"
    "smax v21.4s, v21.4s, v11.4s\n"
    "sqadd v7.4s, v7.4s, v25.4s\n"
    "and v31.16b, v17.16b, v12.16b\n"
    "uzp1 v16.16b, v16.16b, v21.16b\n"
    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
    "uzp1 v16.16b, v16.16b, v16.16b\n"
    "str d16, [x16, x1]\n"
    "srshl v7.4s, v7.4s, v19.4s\n"
    "sshr v31.4s, v31.4s, #0x1f\n"
    "and v24.16b, v8.16b, v19.16b\n"
    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
    "sqadd v17.4s, v17.4s, v31.4s\n"
    "add v7.4s, v7.4s, v10.4s\n"
    "sshr v24.4s, v24.4s, #0x1f\n"
    "and v1.16b, v5.16b, v12.16b\n"
    "smin v7.4s, v7.4s, v13.4s\n"
    "srshl v17.4s, v17.4s, v12.4s\n"
    "sqadd v8.4s, v8.4s, v24.4s\n"
    "smax v7.4s, v7.4s, v11.4s\n"
    "sshr v1.4s, v1.4s, #0x1f\n"
    "add v17.4s, v17.4s, v10.4s\n"
    "srshl v8.4s, v8.4s, v19.4s\n"
    "sqadd v5.4s, v5.4s, v1.4s\n"
    "smin v17.4s, v17.4s, v13.4s\n"
    "add v8.4s, v8.4s, v10.4s\n"
    "smax v17.4s, v17.4s, v11.4s\n"
    "srshl v5.4s, v5.4s, v12.4s\n"
    "smin v8.4s, v8.4s, v13.4s\n"
    "uzp1 v7.16b, v7.16b, v17.16b\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "uzp1 v7.16b, v7.16b, v7.16b\n"
    "str d7, [x6, x1]\n"
    "smax v8.4s, v8.4s, v11.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "smax v5.4s, v5.4s, v11.4s\n"
    "uzp1 v8.16b, v8.16b, v5.16b\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "str d8, [x8, x1]\n"
    "add x1, x1, #0x8\n"
    "beq 124f\n"
    "add x3, x3, #0xc8\n"
    "3:"  // Oddments
    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
    "tbz x4, #2, 5f\n"
    "ld1 { v15.4s }, [x12], #0x10\n"
    "tbz x4, #1, 4f\n"
    "ld1 { v18.d }[0], [x12], #0x8\n"
    "tbz x4, #0, 7f\n"
    "ld1 { v18.s }[2], [x12]\n"
    "b 7f\n"
    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
    "tbz x4, #0, 7f\n"
    "ld1 { v18.s }[0], [x12]\n"
    "b 7f\n"
    "5:"  // Oddments: Load bias: Bit 2: Unset
    "tbz x4, #1, 6f\n"
    "ld1 { v15.d }[0], [x12], #0x8\n"
    "tbz x4, #0, 7f\n"
    "ld1 { v15.s }[2], [x12]\n"
    "b 7f\n"
    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 7f\n"
    "ld1 { v15.s }[0], [x12]\n"
    "7:"  // Oddments: Load bias: Bit 2: End
    "mov v16.16b, v15.16b\n"
    "ldr d0, [x3, #0x0]\n"
    "mov v21.16b, v18.16b\n"
    "ldr d1, [x3, #0x8]\n"
    "mov v7.16b, v15.16b\n"
    "ldr d2, [x3, #0x10]\n"
    "mov v17.16b, v18.16b\n"
    "ldr d3, [x3, #0x18]\n"
    "mov v8.16b, v15.16b\n"
    "ldr d4, [x3, #0x20]\n"
    "mov v5.16b, v18.16b\n"
    "ldp x28, x27, [x25, #0x0]\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "ldp x26, x13, [x25, #0x10]\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "ldp x24, x23, [x25, #0x20]\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "ldp x22, x21, [x25, #0x30]\n"
    "ldp x20, x0, [x25, #0x40]\n"
    "add x28, x28, x10\n"
    "add x27, x27, x10\n"
    "add x26, x26, x10\n"
    "add x13, x13, x10\n"
    "add x24, x24, x10\n"
    "add x23, x23, x10\n"
    "add x22, x22, x10\n"
    "add x21, x21, x10\n"
    "add x20, x20, x10\n"
    "add x0, x0, x10\n"
    "tbz x4, #2, 9f\n"
    "ld1 { v31.s }[0], [x28], #0x4\n"
    "ld1 { v30.s }[0], [x27], #0x4\n"
    "ld1 { v29.s }[0], [x26], #0x4\n"
    "ld1 { v28.s }[0], [x13], #0x4\n"
    "ld1 { v27.s }[0], [x24], #0x4\n"
    "ld1 { v23.s }[0], [x23], #0x4\n"
    "ld1 { v25.s }[0], [x22], #0x4\n"
    "ld1 { v24.s }[0], [x21], #0x4\n"
    "ld1 { v26.s }[0], [x20], #0x4\n"
    "ld1 { v22.s }[0], [x0], #0x4\n"
    "tbz x4, #1, 8f\n"
    "ld1 { v31.h }[2], [x28], #0x2\n"
    "ld1 { v30.h }[2], [x27], #0x2\n"
    "ld1 { v29.h }[2], [x26], #0x2\n"
    "ld1 { v28.h }[2], [x13], #0x2\n"
    "ld1 { v27.h }[2], [x24], #0x2\n"
    "ld1 { v23.h }[2], [x23], #0x2\n"
    "ld1 { v25.h }[2], [x22], #0x2\n"
    "ld1 { v24.h }[2], [x21], #0x2\n"
    "ld1 { v26.h }[2], [x20], #0x2\n"
    "ld1 { v22.h }[2], [x0], #0x2\n"
    "tbz x4, #0, 11f\n"
    "ld1 { v31.b }[6], [x28]\n"
    "ld1 { v30.b }[6], [x27]\n"
    "ld1 { v29.b }[6], [x26]\n"
    "ld1 { v28.b }[6], [x13]\n"
    "ld1 { v27.b }[6], [x24]\n"
    "ld1 { v23.b }[6], [x23]\n"
    "ld1 { v25.b }[6], [x22]\n"
    "ld1 { v24.b }[6], [x21]\n"
    "ld1 { v26.b }[6], [x20]\n"
    "ld1 { v22.b }[6], [x0]\n"
    "b 11f\n"
    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
    "tbz x4, #0, 11f\n"
    "ld1 { v31.b }[4], [x28]\n"
    "ld1 { v30.b }[4], [x27]\n"
    "ld1 { v29.b }[4], [x26]\n"
    "ld1 { v28.b }[4], [x13]\n"
    "ld1 { v27.b }[4], [x24]\n"
    "ld1 { v23.b }[4], [x23]\n"
    "ld1 { v25.b }[4], [x22]\n"
    "ld1 { v24.b }[4], [x21]\n"
    "ld1 { v26.b }[4], [x20]\n"
    "ld1 { v22.b }[4], [x0]\n"
    "b 11f\n"
    "9:"  // Oddments: Initial loads: Bit 2: Unset
    "tbz x4, #1, 10f\n"
    "ld1 { v31.h }[0], [x28], #0x2\n"
    "ld1 { v30.h }[0], [x27], #0x2\n"
    "ld1 { v29.h }[0], [x26], #0x2\n"
    "ld1 { v28.h }[0], [x13], #0x2\n"
    "ld1 { v27.h }[0], [x24], #0x2\n"
    "ld1 { v23.h }[0], [x23], #0x2\n"
    "ld1 { v25.h }[0], [x22], #0x2\n"
    "ld1 { v24.h }[0], [x21], #0x2\n"
    "ld1 { v26.h }[0], [x20], #0x2\n"
    "ld1 { v22.h }[0], [x0], #0x2\n"
    "tbz x4, #0, 11f\n"
    "ld1 { v31.b }[2], [x28]\n"
    "ld1 { v30.b }[2], [x27]\n"
    "ld1 { v29.b }[2], [x26]\n"
    "ld1 { v28.b }[2], [x13]\n"
    "ld1 { v27.b }[2], [x24]\n"
    "ld1 { v23.b }[2], [x23]\n"
    "ld1 { v25.b }[2], [x22]\n"
    "ld1 { v24.b }[2], [x21]\n"
    "ld1 { v26.b }[2], [x20]\n"
    "ld1 { v22.b }[2], [x0]\n"
    "b 11f\n"
    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 11f\n"
    "ld1 { v31.b }[0], [x28]\n"
    "ld1 { v30.b }[0], [x27]\n"
    "ld1 { v29.b }[0], [x26]\n"
    "ld1 { v28.b }[0], [x13]\n"
    "ld1 { v27.b }[0], [x24]\n"
    "ld1 { v23.b }[0], [x23]\n"
    "ld1 { v25.b }[0], [x22]\n"
    "ld1 { v24.b }[0], [x21]\n"
    "ld1 { v26.b }[0], [x20]\n"
    "ld1 { v22.b }[0], [x0]\n"
    "11:"  // Oddments: Initial loads: Bit 2: End
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr x20, [x25, #0x50]\n"
    "add x20, x20, x10\n"
    "usubl v30.8h, v30.8b, v9.8b\n"
    "usubl v29.8h, v29.8b, v9.8b\n"
    "usubl v28.8h, v28.8b, v9.8b\n"
    "usubl v27.8h, v27.8b, v9.8b\n"
    "usubl v23.8h, v23.8b, v9.8b\n"
    "usubl v25.8h, v25.8b, v9.8b\n"
    "usubl v24.8h, v24.8b, v9.8b\n"
    "usubl v26.8h, v26.8b, v9.8b\n"
    "usubl v22.8h, v22.8b, v9.8b\n"
    "smlal v15.4s, v31.4h, v0.4h\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "smlal v7.4s, v29.4h, v0.4h\n"
    "smlal2 v17.4s, v29.8h, v0.8h\n"
    "smlal v8.4s, v28.4h, v0.4h\n"
    "smlal2 v5.4s, v28.8h, v0.8h\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "smlal v16.4s, v27.4h, v1.4h\n"
    "smlal2 v21.4s, v27.8h, v1.8h\n"
    "smlal v7.4s, v28.4h, v1.4h\n"
    "smlal2 v17.4s, v28.8h, v1.8h\n"
    "smlal v8.4s, v23.4h, v1.4h\n"
    "smlal2 v5.4s, v23.8h, v1.8h\n"
    "smlal v15.4s, v27.4h, v2.4h\n"
    "smlal2 v18.4s, v27.8h, v2.8h\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "smlal v7.4s, v23.4h, v2.4h\n"
    "smlal2 v17.4s, v23.8h, v2.8h\n"
    "tbz x4, #2, 13f\n"
    "ld1 { v31.s }[0], [x20], #0x4\n"
    "tbz x4, #1, 12f\n"
    "ld1 { v31.h }[2], [x20], #0x2\n"
    "tbz x4, #0, 15f\n"
    "ld1 { v31.b }[6], [x20]\n"
    "b 15f\n"
    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
    "tbz x4, #0, 15f\n"
    "ld1 { v31.b }[4], [x20]\n"
    "b 15f\n"
    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
    "tbz x4, #1, 14f\n"
    "ld1 { v31.h }[0], [x20], #0x2\n"
    "tbz x4, #0, 15f\n"
    "ld1 { v31.b }[2], [x20]\n"
    "b 15f\n"
    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 15f\n"
    "ld1 { v31.b }[0], [x20]\n"
    "15:"  // Oddments: Load (1, 3): Bit 2: End
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr x28, [x25, #0x58]\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "add x28, x28, x10\n"
    "smlal v8.4s, v31.4h, v2.4h\n"
    "smlal2 v5.4s, v31.8h, v2.8h\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "smlal v7.4s, v31.4h, v3.4h\n"
    "smlal2 v17.4s, v31.8h, v3.8h\n"
    "tbz x4, #2, 17f\n"
    "ld1 { v30.s }[0], [x28], #0x4\n"
    "tbz x4, #1, 16f\n"
    "ld1 { v30.h }[2], [x28], #0x2\n"
    "tbz x4, #0, 19f\n"
    "ld1 { v30.b }[6], [x28]\n"
    "b 19f\n"
    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
    "tbz x4, #0, 19f\n"
    "ld1 { v30.b }[4], [x28]\n"
    "b 19f\n"
    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
    "tbz x4, #1, 18f\n"
    "ld1 { v30.h }[0], [x28], #0x2\n"
    "tbz x4, #0, 19f\n"
    "ld1 { v30.b }[2], [x28]\n"
    "b 19f\n"
    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 19f\n"
    "ld1 { v30.b }[0], [x28]\n"
    "19:"  // Oddments: Load (1, 4): Bit 2: End
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ldr x0, [x25, #0x60]\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "add x0, x0, x10\n"
    "smlal v8.4s, v30.4h, v3.4h\n"
    "smlal2 v5.4s, v30.8h, v3.8h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "tbz x4, #2, 21f\n"
    "ld1 { v27.s }[0], [x0], #0x4\n"
    "tbz x4, #1, 20f\n"
    "ld1 { v27.h }[2], [x0], #0x2\n"
    "tbz x4, #0, 23f\n"
    "ld1 { v27.b }[6], [x0]\n"
    "b 23f\n"
    "20:"  // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
    "tbz x4, #0, 23f\n"
    "ld1 { v27.b }[4], [x0]\n"
    "b 23f\n"
    "21:"  // Oddments: Load (0, 5): Bit 2: Unset
    "tbz x4, #1, 22f\n"
    "ld1 { v27.h }[0], [x0], #0x2\n"
    "tbz x4, #0, 23f\n"
    "ld1 { v27.b }[2], [x0]\n"
    "b 23f\n"
    "22:"  // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 23f\n"
    "ld1 { v27.b }[0], [x0]\n"
    "23:"  // Oddments: Load (0, 5): Bit 2: End
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ldr d0, [x3, #0x28]\n"
    "smlal v7.4s, v30.4h, v4.4h\n"
    "ldr x7, [x25, #0x68]\n"
    "add x7, x7, x10\n"
    "smlal v16.4s, v27.4h, v4.4h\n"
    "smlal2 v21.4s, v27.8h, v4.8h\n"
    "smlal2 v17.4s, v30.8h, v4.8h\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v29.4h, v0.4h\n"
    "smlal2 v18.4s, v29.8h, v0.8h\n"
    "smlal v16.4s, v28.4h, v0.4h\n"
    "smlal2 v21.4s, v28.8h, v0.8h\n"
    "smlal v7.4s, v22.4h, v0.4h\n"
    "smlal2 v17.4s, v22.8h, v0.8h\n"
    "tbz x4, #2, 25f\n"
    "ld1 { v25.s }[0], [x7], #0x4\n"
    "tbz x4, #1, 24f\n"
    "ld1 { v25.h }[2], [x7], #0x2\n"
    "tbz x4, #0, 27f\n"
    "ld1 { v25.b }[6], [x7]\n"
    "b 27f\n"
    "24:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
    "tbz x4, #0, 27f\n"
    "ld1 { v25.b }[4], [x7]\n"
    "b 27f\n"
    "25:"  // Oddments: Load (2, 1): Bit 2: Unset
    "tbz x4, #1, 26f\n"
    "ld1 { v25.h }[0], [x7], #0x2\n"
    "tbz x4, #0, 27f\n"
    "ld1 { v25.b }[2], [x7]\n"
    "b 27f\n"
    "26:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 27f\n"
    "ld1 { v25.b }[0], [x7]\n"
    "27:"  // Oddments: Load (2, 1): Bit 2: End
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ldr d1, [x3, #0x30]\n"
    "smlal v8.4s, v25.4h, v0.4h\n"
    "ldr x26, [x25, #0x70]\n"
    "add x26, x26, x10\n"
    "smlal2 v5.4s, v25.8h, v0.8h\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v28.4h, v1.4h\n"
    "smlal2 v18.4s, v28.8h, v1.8h\n"
    "smlal v16.4s, v23.4h, v1.4h\n"
    "smlal2 v21.4s, v23.8h, v1.8h\n"
    "smlal v7.4s, v25.4h, v1.4h\n"
    "smlal2 v17.4s, v25.8h, v1.8h\n"
    "tbz x4, #2, 29f\n"
    "ld1 { v24.s }[0], [x26], #0x4\n"
    "tbz x4, #1, 28f\n"
    "ld1 { v24.h }[2], [x26], #0x2\n"
    "tbz x4, #0, 31f\n"
    "ld1 { v24.b }[6], [x26]\n"
    "b 31f\n"
    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
    "tbz x4, #0, 31f\n"
    "ld1 { v24.b }[4], [x26]\n"
    "b 31f\n"
    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
    "tbz x4, #1, 30f\n"
    "ld1 { v24.h }[0], [x26], #0x2\n"
    "tbz x4, #0, 31f\n"
    "ld1 { v24.b }[2], [x26]\n"
    "b 31f\n"
    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 31f\n"
    "ld1 { v24.b }[0], [x26]\n"
    "31:"  // Oddments: Load (2, 2): Bit 2: End
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ldr d2, [x3, #0x38]\n"
    "smlal v8.4s, v24.4h, v1.4h\n"
    "ldr x23, [x25, #0x78]\n"
    "add x23, x23, x10\n"
    "smlal2 v5.4s, v24.8h, v1.8h\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v23.4h, v2.4h\n"
    "smlal2 v18.4s, v23.8h, v2.8h\n"
    "smlal v16.4s, v31.4h, v2.4h\n"
    "smlal2 v21.4s, v31.8h, v2.8h\n"
    "smlal v7.4s, v24.4h, v2.4h\n"
    "smlal2 v17.4s, v24.8h, v2.8h\n"
    "tbz x4, #2, 33f\n"
    "ld1 { v27.s }[0], [x23], #0x4\n"
    "tbz x4, #1, 32f\n"
    "ld1 { v27.h }[2], [x23], #0x2\n"
    "tbz x4, #0, 35f\n"
    "ld1 { v27.b }[6], [x23]\n"
    "b 35f\n"
    "32:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
    "tbz x4, #0, 35f\n"
    "ld1 { v27.b }[4], [x23]\n"
    "b 35f\n"
    "33:"  // Oddments: Load (2, 3): Bit 2: Unset
    "tbz x4, #1, 34f\n"
    "ld1 { v27.h }[0], [x23], #0x2\n"
    "tbz x4, #0, 35f\n"
    "ld1 { v27.b }[2], [x23]\n"
    "b 35f\n"
    "34:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 35f\n"
    "ld1 { v27.b }[0], [x23]\n"
    "35:"  // Oddments: Load (2, 3): Bit 2: End
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ldr d3, [x3, #0x40]\n"
    "smlal v8.4s, v27.4h, v2.4h\n"
    "ldr x20, [x25, #0x80]\n"
    "add x20, x20, x10\n"
    "smlal2 v5.4s, v27.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v31.4h, v3.4h\n"
    "smlal2 v18.4s, v31.8h, v3.8h\n"
    "smlal v16.4s, v30.4h, v3.4h\n"
    "smlal2 v21.4s, v30.8h, v3.8h\n"
    "smlal v7.4s, v27.4h, v3.4h\n"
    "smlal2 v17.4s, v27.8h, v3.8h\n"
    "tbz x4, #2, 37f\n"
    "ld1 { v23.s }[0], [x20], #0x4\n"
    "tbz x4, #1, 36f\n"
    "ld1 { v23.h }[2], [x20], #0x2\n"
    "tbz x4, #0, 39f\n"
    "ld1 { v23.b }[6], [x20]\n"
    "b 39f\n"
    "36:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
    "tbz x4, #0, 39f\n"
    "ld1 { v23.b }[4], [x20]\n"
    "b 39f\n"
    "37:"  // Oddments: Load (2, 4): Bit 2: Unset
    "tbz x4, #1, 38f\n"
    "ld1 { v23.h }[0], [x20], #0x2\n"
    "tbz x4, #0, 39f\n"
    "ld1 { v23.b }[2], [x20]\n"
    "b 39f\n"
    "38:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 39f\n"
    "ld1 { v23.b }[0], [x20]\n"
    "39:"  // Oddments: Load (2, 4): Bit 2: End
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ldr d4, [x3, #0x48]\n"
    "smlal v8.4s, v23.4h, v3.4h\n"
    "ldr x22, [x25, #0x88]\n"
    "add x22, x22, x10\n"
    "smlal2 v5.4s, v23.8h, v3.8h\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v4.4h\n"
    "smlal2 v18.4s, v30.8h, v4.8h\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "smlal v7.4s, v23.4h, v4.4h\n"
    "smlal2 v17.4s, v23.8h, v4.8h\n"
    "tbz x4, #2, 41f\n"
    "ld1 { v28.s }[0], [x22], #0x4\n"
    "tbz x4, #1, 40f\n"
    "ld1 { v28.h }[2], [x22], #0x2\n"
    "tbz x4, #0, 43f\n"
    "ld1 { v28.b }[6], [x22]\n"
    "b 43f\n"
    "40:"  // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
    "tbz x4, #0, 43f\n"
    "ld1 { v28.b }[4], [x22]\n"
    "b 43f\n"
    "41:"  // Oddments: Load (2, 5): Bit 2: Unset
    "tbz x4, #1, 42f\n"
    "ld1 { v28.h }[0], [x22], #0x2\n"
    "tbz x4, #0, 43f\n"
    "ld1 { v28.b }[2], [x22]\n"
    "b 43f\n"
    "42:"  // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 43f\n"
    "ld1 { v28.b }[0], [x22]\n"
    "43:"  // Oddments: Load (2, 5): Bit 2: End
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ldr d0, [x3, #0x50]\n"
    "smlal v8.4s, v28.4h, v4.4h\n"
    "ldr x13, [x25, #0x90]\n"
    "add x13, x13, x10\n"
    "smlal2 v5.4s, v28.8h, v4.8h\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v22.4h, v0.4h\n"
    "smlal2 v18.4s, v22.8h, v0.8h\n"
    "smlal v16.4s, v25.4h, v0.4h\n"
    "smlal2 v21.4s, v25.8h, v0.8h\n"
    "tbz x4, #2, 45f\n"
    "ld1 { v31.s }[0], [x13], #0x4\n"
    "tbz x4, #1, 44f\n"
    "ld1 { v31.h }[2], [x13], #0x2\n"
    "tbz x4, #0, 47f\n"
    "ld1 { v31.b }[6], [x13]\n"
    "b 47f\n"
    "44:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
    "tbz x4, #0, 47f\n"
    "ld1 { v31.b }[4], [x13]\n"
    "b 47f\n"
    "45:"  // Oddments: Load (3, 0): Bit 2: Unset
    "tbz x4, #1, 46f\n"
    "ld1 { v31.h }[0], [x13], #0x2\n"
    "tbz x4, #0, 47f\n"
    "ld1 { v31.b }[2], [x13]\n"
    "b 47f\n"
    "46:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 47f\n"
    "ld1 { v31.b }[0], [x13]\n"
    "47:"  // Oddments: Load (3, 0): Bit 2: End
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr x21, [x25, #0x98]\n"
    "smlal v7.4s, v31.4h, v0.4h\n"
    "add x21, x21, x10\n"
    "smlal2 v17.4s, v31.8h, v0.8h\n"
    "tbz x4, #2, 49f\n"
    "ld1 { v30.s }[0], [x21], #0x4\n"
    "tbz x4, #1, 48f\n"
    "ld1 { v30.h }[2], [x21], #0x2\n"
    "tbz x4, #0, 51f\n"
    "ld1 { v30.b }[6], [x21]\n"
    "b 51f\n"
    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
    "tbz x4, #0, 51f\n"
    "ld1 { v30.b }[4], [x21]\n"
    "b 51f\n"
    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
    "tbz x4, #1, 50f\n"
    "ld1 { v30.h }[0], [x21], #0x2\n"
    "tbz x4, #0, 51f\n"
    "ld1 { v30.b }[2], [x21]\n"
    "b 51f\n"
    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 51f\n"
    "ld1 { v30.b }[0], [x21]\n"
    "51:"  // Oddments: Load (3, 1): Bit 2: End
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ldr d1, [x3, #0x58]\n"
    "smlal v8.4s, v30.4h, v0.4h\n"
    "ldr x14, [x25, #0xa0]\n"
    "add x14, x14, x10\n"
    "smlal2 v5.4s, v30.8h, v0.8h\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v25.4h, v1.4h\n"
    "smlal2 v18.4s, v25.8h, v1.8h\n"
    "smlal v16.4s, v24.4h, v1.4h\n"
    "smlal2 v21.4s, v24.8h, v1.8h\n"
    "smlal v7.4s, v30.4h, v1.4h\n"
    "smlal2 v17.4s, v30.8h, v1.8h\n"
    "tbz x4, #2, 53f\n"
    "ld1 { v26.s }[0], [x14], #0x4\n"
    "tbz x4, #1, 52f\n"
    "ld1 { v26.h }[2], [x14], #0x2\n"
    "tbz x4, #0, 55f\n"
    "ld1 { v26.b }[6], [x14]\n"
    "b 55f\n"
    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
    "tbz x4, #0, 55f\n"
    "ld1 { v26.b }[4], [x14]\n"
    "b 55f\n"
    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
    "tbz x4, #1, 54f\n"
    "ld1 { v26.h }[0], [x14], #0x2\n"
    "tbz x4, #0, 55f\n"
    "ld1 { v26.b }[2], [x14]\n"
    "b 55f\n"
    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 55f\n"
    "ld1 { v26.b }[0], [x14]\n"
    "55:"  // Oddments: Load (3, 2): Bit 2: End
    "usubl v26.8h, v26.8b, v9.8b\n"
    "ldr d2, [x3, #0x60]\n"
    "smlal v8.4s, v26.4h, v1.4h\n"
    "ldr x11, [x25, #0xa8]\n"
    "add x11, x11, x10\n"
    "smlal2 v5.4s, v26.8h, v1.8h\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v2.4h\n"
    "smlal2 v18.4s, v24.8h, v2.8h\n"
    "smlal v16.4s, v27.4h, v2.4h\n"
    "smlal2 v21.4s, v27.8h, v2.8h\n"
    "smlal v7.4s, v26.4h, v2.4h\n"
    "smlal2 v17.4s, v26.8h, v2.8h\n"
    "tbz x4, #2, 57f\n"
    "ld1 { v25.s }[0], [x11], #0x4\n"
    "tbz x4, #1, 56f\n"
    "ld1 { v25.h }[2], [x11], #0x2\n"
    "tbz x4, #0, 59f\n"
    "ld1 { v25.b }[6], [x11]\n"
    "b 59f\n"
    "56:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
    "tbz x4, #0, 59f\n"
    "ld1 { v25.b }[4], [x11]\n"
    "b 59f\n"
    "57:"  // Oddments: Load (3, 3): Bit 2: Unset
    "tbz x4, #1, 58f\n"
    "ld1 { v25.h }[0], [x11], #0x2\n"
    "tbz x4, #0, 59f\n"
    "ld1 { v25.b }[2], [x11]\n"
    "b 59f\n"
    "58:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 59f\n"
    "ld1 { v25.b }[0], [x11]\n"
    "59:"  // Oddments: Load (3, 3): Bit 2: End
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ldr d3, [x3, #0x68]\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "ldr x24, [x25, #0xb0]\n"
    "add x24, x24, x10\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v27.4h, v3.4h\n"
    "smlal2 v18.4s, v27.8h, v3.8h\n"
    "smlal v16.4s, v23.4h, v3.4h\n"
    "smlal2 v21.4s, v23.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "tbz x4, #2, 61f\n"
    "ld1 { v24.s }[0], [x24], #0x4\n"
    "tbz x4, #1, 60f\n"
    "ld1 { v24.h }[2], [x24], #0x2\n"
    "tbz x4, #0, 63f\n"
    "ld1 { v24.b }[6], [x24]\n"
    "b 63f\n"
    "60:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
    "tbz x4, #0, 63f\n"
    "ld1 { v24.b }[4], [x24]\n"
    "b 63f\n"
    "61:"  // Oddments: Load (3, 4): Bit 2: Unset
    "tbz x4, #1, 62f\n"
    "ld1 { v24.h }[0], [x24], #0x2\n"
    "tbz x4, #0, 63f\n"
    "ld1 { v24.b }[2], [x24]\n"
    "b 63f\n"
    "62:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 63f\n"
    "ld1 { v24.b }[0], [x24]\n"
    "63:"  // Oddments: Load (3, 4): Bit 2: End
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ldr d4, [x3, #0x70]\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "ldr x0, [x25, #0xb8]\n"
    "add x0, x0, x10\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v23.4h, v4.4h\n"
    "smlal2 v18.4s, v23.8h, v4.8h\n"
    "smlal v16.4s, v28.4h, v4.4h\n"
    "smlal2 v21.4s, v28.8h, v4.8h\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "tbz x4, #2, 65f\n"
    "ld1 { v22.s }[0], [x0], #0x4\n"
    "tbz x4, #1, 64f\n"
    "ld1 { v22.h }[2], [x0], #0x2\n"
    "tbz x4, #0, 67f\n"
    "ld1 { v22.b }[6], [x0]\n"
    "b 67f\n"
    "64:"  // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
    "tbz x4, #0, 67f\n"
    "ld1 { v22.b }[4], [x0]\n"
    "b 67f\n"
    "65:"  // Oddments: Load (3, 5): Bit 2: Unset
    "tbz x4, #1, 66f\n"
    "ld1 { v22.h }[0], [x0], #0x2\n"
    "tbz x4, #0, 67f\n"
    "ld1 { v22.b }[2], [x0]\n"
    "b 67f\n"
    "66:"  // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 67f\n"
    "ld1 { v22.b }[0], [x0]\n"
    "67:"  // Oddments: Load (3, 5): Bit 2: End
    "usubl v22.8h, v22.8b, v9.8b\n"
    "ldr d0, [x3, #0x78]\n"
    "smlal v8.4s, v22.4h, v4.4h\n"
    "ldr x15, [x25, #0xc0]\n"
    "add x15, x15, x10\n"
    "smlal2 v5.4s, v22.8h, v4.8h\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v31.4h, v0.4h\n"
    "smlal2 v18.4s, v31.8h, v0.8h\n"
    "smlal v16.4s, v30.4h, v0.4h\n"
    "smlal2 v21.4s, v30.8h, v0.8h\n"
    "tbz x4, #2, 69f\n"
    "ld1 { v27.s }[0], [x15], #0x4\n"
    "tbz x4, #1, 68f\n"
    "ld1 { v27.h }[2], [x15], #0x2\n"
    "tbz x4, #0, 71f\n"
    "ld1 { v27.b }[6], [x15]\n"
    "b 71f\n"
    "68:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
    "tbz x4, #0, 71f\n"
    "ld1 { v27.b }[4], [x15]\n"
    "b 71f\n"
    "69:"  // Oddments: Load (4, 0): Bit 2: Unset
    "tbz x4, #1, 70f\n"
    "ld1 { v27.h }[0], [x15], #0x2\n"
    "tbz x4, #0, 71f\n"
    "ld1 { v27.b }[2], [x15]\n"
    "b 71f\n"
    "70:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 71f\n"
    "ld1 { v27.b }[0], [x15]\n"
    "71:"  // Oddments: Load (4, 0): Bit 2: End
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ldr x9, [x25, #0xc8]\n"
    "smlal v7.4s, v27.4h, v0.4h\n"
    "add x9, x9, x10\n"
    "smlal2 v17.4s, v27.8h, v0.8h\n"
    "tbz x4, #2, 73f\n"
    "ld1 { v23.s }[0], [x9], #0x4\n"
    "tbz x4, #1, 72f\n"
    "ld1 { v23.h }[2], [x9], #0x2\n"
    "tbz x4, #0, 75f\n"
    "ld1 { v23.b }[6], [x9]\n"
    "b 75f\n"
    "72:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
    "tbz x4, #0, 75f\n"
    "ld1 { v23.b }[4], [x9]\n"
    "b 75f\n"
    "73:"  // Oddments: Load (4, 1): Bit 2: Unset
    "tbz x4, #1, 74f\n"
    "ld1 { v23.h }[0], [x9], #0x2\n"
    "tbz x4, #0, 75f\n"
    "ld1 { v23.b }[2], [x9]\n"
    "b 75f\n"
    "74:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 75f\n"
    "ld1 { v23.b }[0], [x9]\n"
    "75:"  // Oddments: Load (4, 1): Bit 2: End
    "usubl v23.8h, v23.8b, v9.8b\n"
    "ldr d1, [x3, #0x80]\n"
    "smlal v8.4s, v23.4h, v0.4h\n"
    "ldr x27, [x25, #0xd0]\n"
    "add x27, x27, x10\n"
    "smlal2 v5.4s, v23.8h, v0.8h\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v1.4h\n"
    "smlal2 v18.4s, v30.8h, v1.8h\n"
    "smlal v16.4s, v26.4h, v1.4h\n"
    "smlal2 v21.4s, v26.8h, v1.8h\n"
    "smlal v7.4s, v23.4h, v1.4h\n"
    "smlal2 v17.4s, v23.8h, v1.8h\n"
    "tbz x4, #2, 77f\n"
    "ld1 { v31.s }[0], [x27], #0x4\n"
    "tbz x4, #1, 76f\n"
    "ld1 { v31.h }[2], [x27], #0x2\n"
    "tbz x4, #0, 79f\n"
    "ld1 { v31.b }[6], [x27]\n"
    "b 79f\n"
    "76:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
    "tbz x4, #0, 79f\n"
    "ld1 { v31.b }[4], [x27]\n"
    "b 79f\n"
    "77:"  // Oddments: Load (4, 2): Bit 2: Unset
    "tbz x4, #1, 78f\n"
    "ld1 { v31.h }[0], [x27], #0x2\n"
    "tbz x4, #0, 79f\n"
    "ld1 { v31.b }[2], [x27]\n"
    "b 79f\n"
    "78:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 79f\n"
    "ld1 { v31.b }[0], [x27]\n"
    "79:"  // Oddments: Load (4, 2): Bit 2: End
    "usubl v31.8h, v31.8b, v9.8b\n"
    "ldr d2, [x3, #0x88]\n"
    "smlal v8.4s, v31.4h, v1.4h\n"
    "ldr x28, [x25, #0xd8]\n"
    "add x28, x28, x10\n"
    "smlal2 v5.4s, v31.8h, v1.8h\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v26.4h, v2.4h\n"
    "smlal2 v18.4s, v26.8h, v2.8h\n"
    "smlal v16.4s, v25.4h, v2.4h\n"
    "smlal2 v21.4s, v25.8h, v2.8h\n"
    "smlal v7.4s, v31.4h, v2.4h\n"
    "smlal2 v17.4s, v31.8h, v2.8h\n"
    "tbz x4, #2, 81f\n"
    "ld1 { v30.s }[0], [x28], #0x4\n"
    "tbz x4, #1, 80f\n"
    "ld1 { v30.h }[2], [x28], #0x2\n"
    "tbz x4, #0, 83f\n"
    "ld1 { v30.b }[6], [x28]\n"
    "b 83f\n"
    "80:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
    "tbz x4, #0, 83f\n"
    "ld1 { v30.b }[4], [x28]\n"
    "b 83f\n"
    "81:"  // Oddments: Load (4, 3): Bit 2: Unset
    "tbz x4, #1, 82f\n"
    "ld1 { v30.h }[0], [x28], #0x2\n"
    "tbz x4, #0, 83f\n"
    "ld1 { v30.b }[2], [x28]\n"
    "b 83f\n"
    "82:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 83f\n"
    "ld1 { v30.b }[0], [x28]\n"
    "83:"  // Oddments: Load (4, 3): Bit 2: End
    "usubl v30.8h, v30.8b, v9.8b\n"
    "ldr d3, [x3, #0x90]\n"
    "smlal v8.4s, v30.4h, v2.4h\n"
    "ldr x12, [x25, #0xe0]\n"
    "add x12, x12, x10\n"
    "smlal2 v5.4s, v30.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v25.4h, v3.4h\n"
    "smlal2 v18.4s, v25.8h, v3.8h\n"
    "smlal v16.4s, v24.4h, v3.4h\n"
    "smlal2 v21.4s, v24.8h, v3.8h\n"
    "smlal v7.4s, v30.4h, v3.4h\n"
    "smlal2 v17.4s, v30.8h, v3.8h\n"
    "tbz x4, #2, 85f\n"
    "ld1 { v28.s }[0], [x12], #0x4\n"
    "tbz x4, #1, 84f\n"
    "ld1 { v28.h }[2], [x12], #0x2\n"
    "tbz x4, #0, 87f\n"
    "ld1 { v28.b }[6], [x12]\n"
    "b 87f\n"
    "84:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
    "tbz x4, #0, 87f\n"
    "ld1 { v28.b }[4], [x12]\n"
    "b 87f\n"
    "85:"  // Oddments: Load (4, 4): Bit 2: Unset
    "tbz x4, #1, 86f\n"
    "ld1 { v28.h }[0], [x12], #0x2\n"
    "tbz x4, #0, 87f\n"
    "ld1 { v28.b }[2], [x12]\n"
    "b 87f\n"
    "86:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 87f\n"
    "ld1 { v28.b }[0], [x12]\n"
    "87:"  // Oddments: Load (4, 4): Bit 2: End
    "usubl v28.8h, v28.8b, v9.8b\n"
    "ldr d4, [x3, #0x98]\n"
    "smlal v8.4s, v28.4h, v3.4h\n"
    "ldr x7, [x25, #0xe8]\n"
    "add x7, x7, x10\n"
    "smlal2 v5.4s, v28.8h, v3.8h\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v24.4h, v4.4h\n"
    "smlal2 v18.4s, v24.8h, v4.8h\n"
    "smlal v16.4s, v22.4h, v4.4h\n"
    "smlal2 v21.4s, v22.8h, v4.8h\n"
    "smlal v7.4s, v28.4h, v4.4h\n"
    "smlal2 v17.4s, v28.8h, v4.8h\n"
    "tbz x4, #2, 89f\n"
    "ld1 { v26.s }[0], [x7], #0x4\n"
    "tbz x4, #1, 88f\n"
    "ld1 { v26.h }[2], [x7], #0x2\n"
    "tbz x4, #0, 91f\n"
    "ld1 { v26.b }[6], [x7]\n"
    "b 91f\n"
    "88:"  // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
    "tbz x4, #0, 91f\n"
    "ld1 { v26.b }[4], [x7]\n"
    "b 91f\n"
    "89:"  // Oddments: Load (4, 5): Bit 2: Unset
    "tbz x4, #1, 90f\n"
    "ld1 { v26.h }[0], [x7], #0x2\n"
    "tbz x4, #0, 91f\n"
    "ld1 { v26.b }[2], [x7]\n"
    "b 91f\n"
    "90:"  // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 91f\n"
    "ld1 { v26.b }[0], [x7]\n"
    "91:"  // Oddments: Load (4, 5): Bit 2: End
    "usubl v26.8h, v26.8b, v9.8b\n"
    "ldr d0, [x3, #0xa0]\n"
    "smlal v8.4s, v26.4h, v4.4h\n"
    "ldr x26, [x25, #0xf0]\n"
    "add x26, x26, x10\n"
    "smlal2 v5.4s, v26.8h, v4.8h\n"
    "ssubl v0.8h, v0.8b, v14.8b\n"
    "smlal v15.4s, v27.4h, v0.4h\n"
    "smlal2 v18.4s, v27.8h, v0.8h\n"
    "smlal v16.4s, v23.4h, v0.4h\n"
    "smlal2 v21.4s, v23.8h, v0.8h\n"
    "tbz x4, #2, 93f\n"
    "ld1 { v25.s }[0], [x26], #0x4\n"
    "tbz x4, #1, 92f\n"
    "ld1 { v25.h }[2], [x26], #0x2\n"
    "tbz x4, #0, 95f\n"
    "ld1 { v25.b }[6], [x26]\n"
    "b 95f\n"
    "92:"  // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
    "tbz x4, #0, 95f\n"
    "ld1 { v25.b }[4], [x26]\n"
    "b 95f\n"
    "93:"  // Oddments: Load (5, 0): Bit 2: Unset
    "tbz x4, #1, 94f\n"
    "ld1 { v25.h }[0], [x26], #0x2\n"
    "tbz x4, #0, 95f\n"
    "ld1 { v25.b }[2], [x26]\n"
    "b 95f\n"
    "94:"  // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 95f\n"
    "ld1 { v25.b }[0], [x26]\n"
    "95:"  // Oddments: Load (5, 0): Bit 2: End
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ldr x23, [x25, #0xf8]\n"
    "smlal v7.4s, v25.4h, v0.4h\n"
    "add x23, x23, x10\n"
    "smlal2 v17.4s, v25.8h, v0.8h\n"
    "tbz x4, #2, 97f\n"
    "ld1 { v24.s }[0], [x23], #0x4\n"
    "tbz x4, #1, 96f\n"
    "ld1 { v24.h }[2], [x23], #0x2\n"
    "tbz x4, #0, 99f\n"
    "ld1 { v24.b }[6], [x23]\n"
    "b 99f\n"
    "96:"  // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
    "tbz x4, #0, 99f\n"
    "ld1 { v24.b }[4], [x23]\n"
    "b 99f\n"
    "97:"  // Oddments: Load (5, 1): Bit 2: Unset
    "tbz x4, #1, 98f\n"
    "ld1 { v24.h }[0], [x23], #0x2\n"
    "tbz x4, #0, 99f\n"
    "ld1 { v24.b }[2], [x23]\n"
    "b 99f\n"
    "98:"  // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 99f\n"
    "ld1 { v24.b }[0], [x23]\n"
    "99:"  // Oddments: Load (5, 1): Bit 2: End
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ldr d1, [x3, #0xa8]\n"
    "smlal v8.4s, v24.4h, v0.4h\n"
    "ldr x22, [x25, #0x100]\n"
    "add x22, x22, x10\n"
    "smlal2 v5.4s, v24.8h, v0.8h\n"
    "ssubl v1.8h, v1.8b, v14.8b\n"
    "smlal v15.4s, v23.4h, v1.4h\n"
    "smlal2 v18.4s, v23.8h, v1.8h\n"
    "smlal v16.4s, v31.4h, v1.4h\n"
    "smlal2 v21.4s, v31.8h, v1.8h\n"
    "smlal v7.4s, v24.4h, v1.4h\n"
    "smlal2 v17.4s, v24.8h, v1.8h\n"
    "tbz x4, #2, 101f\n"
    "ld1 { v27.s }[0], [x22], #0x4\n"
    "tbz x4, #1, 100f\n"
    "ld1 { v27.h }[2], [x22], #0x2\n"
    "tbz x4, #0, 103f\n"
    "ld1 { v27.b }[6], [x22]\n"
    "b 103f\n"
    "100:"  // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
    "tbz x4, #0, 103f\n"
    "ld1 { v27.b }[4], [x22]\n"
    "b 103f\n"
    "101:"  // Oddments: Load (5, 2): Bit 2: Unset
    "tbz x4, #1, 102f\n"
    "ld1 { v27.h }[0], [x22], #0x2\n"
    "tbz x4, #0, 103f\n"
    "ld1 { v27.b }[2], [x22]\n"
    "b 103f\n"
    "102:"  // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 103f\n"
    "ld1 { v27.b }[0], [x22]\n"
    "103:"  // Oddments: Load (5, 2): Bit 2: End
    "usubl v27.8h, v27.8b, v9.8b\n"
    "ldr d2, [x3, #0xb0]\n"
    "smlal v8.4s, v27.4h, v1.4h\n"
    "ldr x20, [x25, #0x108]\n"
    "add x20, x20, x10\n"
    "smlal2 v5.4s, v27.8h, v1.8h\n"
    "ssubl v2.8h, v2.8b, v14.8b\n"
    "smlal v15.4s, v31.4h, v2.4h\n"
    "smlal2 v18.4s, v31.8h, v2.8h\n"
    "smlal v16.4s, v30.4h, v2.4h\n"
    "smlal2 v21.4s, v30.8h, v2.8h\n"
    "smlal v7.4s, v27.4h, v2.4h\n"
    "smlal2 v17.4s, v27.8h, v2.8h\n"
    "tbz x4, #2, 105f\n"
    "ld1 { v25.s }[0], [x20], #0x4\n"
    "tbz x4, #1, 104f\n"
    "ld1 { v25.h }[2], [x20], #0x2\n"
    "tbz x4, #0, 107f\n"
    "ld1 { v25.b }[6], [x20]\n"
    "b 107f\n"
    "104:"  // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
    "tbz x4, #0, 107f\n"
    "ld1 { v25.b }[4], [x20]\n"
    "b 107f\n"
    "105:"  // Oddments: Load (5, 3): Bit 2: Unset
    "tbz x4, #1, 106f\n"
    "ld1 { v25.h }[0], [x20], #0x2\n"
    "tbz x4, #0, 107f\n"
    "ld1 { v25.b }[2], [x20]\n"
    "b 107f\n"
    "106:"  // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 107f\n"
    "ld1 { v25.b }[0], [x20]\n"
    "107:"  // Oddments: Load (5, 3): Bit 2: End
    "usubl v25.8h, v25.8b, v9.8b\n"
    "ldr d3, [x3, #0xb8]\n"
    "smlal v8.4s, v25.4h, v2.4h\n"
    "ldr x13, [x25, #0x110]\n"
    "add x13, x13, x10\n"
    "smlal2 v5.4s, v25.8h, v2.8h\n"
    "ssubl v3.8h, v3.8b, v14.8b\n"
    "smlal v15.4s, v30.4h, v3.4h\n"
    "smlal2 v18.4s, v30.8h, v3.8h\n"
    "smlal v16.4s, v28.4h, v3.4h\n"
    "smlal2 v21.4s, v28.8h, v3.8h\n"
    "smlal v7.4s, v25.4h, v3.4h\n"
    "smlal2 v17.4s, v25.8h, v3.8h\n"
    "tbz x4, #2, 109f\n"
    "ld1 { v24.s }[0], [x13], #0x4\n"
    "tbz x4, #1, 108f\n"
    "ld1 { v24.h }[2], [x13], #0x2\n"
    "tbz x4, #0, 111f\n"
    "ld1 { v24.b }[6], [x13]\n"
    "b 111f\n"
    "108:"  // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
    "tbz x4, #0, 111f\n"
    "ld1 { v24.b }[4], [x13]\n"
    "b 111f\n"
    "109:"  // Oddments: Load (5, 4): Bit 2: Unset
    "tbz x4, #1, 110f\n"
    "ld1 { v24.h }[0], [x13], #0x2\n"
    "tbz x4, #0, 111f\n"
    "ld1 { v24.b }[2], [x13]\n"
    "b 111f\n"
    "110:"  // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 111f\n"
    "ld1 { v24.b }[0], [x13]\n"
    "111:"  // Oddments: Load (5, 4): Bit 2: End
    "usubl v24.8h, v24.8b, v9.8b\n"
    "ldr d4, [x3, #0xc0]\n"
    "smlal v8.4s, v24.4h, v3.4h\n"
    "ldr x21, [x25, #0x118]\n"
    "add x21, x21, x10\n"
    "smlal2 v5.4s, v24.8h, v3.8h\n"
    "ssubl v4.8h, v4.8b, v14.8b\n"
    "smlal v15.4s, v28.4h, v4.4h\n"
    "smlal2 v18.4s, v28.8h, v4.8h\n"
    "smlal v16.4s, v26.4h, v4.4h\n"
    "smlal2 v21.4s, v26.8h, v4.8h\n"
    "smlal v7.4s, v24.4h, v4.4h\n"
    "smlal2 v17.4s, v24.8h, v4.8h\n"
    "tbz x4, #2, 113f\n"
    "ld1 { v27.s }[0], [x21], #0x4\n"
    "tbz x4, #1, 112f\n"
    "ld1 { v27.h }[2], [x21], #0x2\n"
    "tbz x4, #0, 115f\n"
    "ld1 { v27.b }[6], [x21]\n"
    "b 115f\n"
    "112:"  // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
    "tbz x4, #0, 115f\n"
    "ld1 { v27.b }[4], [x21]\n"
    "b 115f\n"
    "113:"  // Oddments: Load (5, 5): Bit 2: Unset
    "tbz x4, #1, 114f\n"
    "ld1 { v27.h }[0], [x21], #0x2\n"
    "tbz x4, #0, 115f\n"
    "ld1 { v27.b }[2], [x21]\n"
    "b 115f\n"
    "114:"  // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 115f\n"
    "ld1 { v27.b }[0], [x21]\n"
    "115:"  // Oddments: Load (5, 5): Bit 2: End
    "usubl v27.8h, v27.8b, v9.8b\n"
    "smlal v8.4s, v27.4h, v4.4h\n"
    "smlal2 v5.4s, v27.8h, v4.8h\n"
    "tbz x4, #2, 117f\n"
    "ld1 { v6.4s }, [x2], #0x10\n"
    "ld1 { v19.4s }, [x5], #0x10\n"
    "tbz x4, #1, 116f\n"
    "ld1 { v20.d }[0], [x2], #0x8\n"
    "ld1 { v12.d }[0], [x5], #0x8\n"
    "tbz x4, #0, 119f\n"
    "ld1 { v20.s }[2], [x2]\n"
    "ld1 { v12.s }[2], [x5]\n"
    "b 119f\n"
    "116:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
    "tbz x4, #0, 119f\n"
    "ld1 { v20.s }[0], [x2]\n"
    "ld1 { v12.s }[0], [x5]\n"
    "b 119f\n"
    "117:"  // Oddments: Load requant params: Bit 2: Unset
    "tbz x4, #1, 118f\n"
    "ld1 { v6.d }[0], [x2], #0x8\n"
    "ld1 { v19.d }[0], [x5], #0x8\n"
    "tbz x4, #0, 119f\n"
    "ld1 { v6.s }[2], [x2]\n"
    "ld1 { v19.s }[2], [x5]\n"
    "b 119f\n"
    "118:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 119f\n"
    "ld1 { v6.s }[0], [x2]\n"
    "ld1 { v19.s }[0], [x5]\n"
    "119:"  // Oddments: Load requant params: Bit 2: End
    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
    "add x17, x17, x1\n"
    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
    "add x16, x16, x1\n"
    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
    "add x6, x6, x1\n"
    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
    "add x8, x8, x1\n"
    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
    "and v28.16b, v15.16b, v19.16b\n"
    "and v26.16b, v18.16b, v12.16b\n"
    "and v29.16b, v16.16b, v19.16b\n"
    "sshr v28.4s, v28.4s, #0x1f\n"
    "sshr v26.4s, v26.4s, #0x1f\n"
    "sshr v29.4s, v29.4s, #0x1f\n"
    "sqadd v15.4s, v15.4s, v28.4s\n"
    "sqadd v18.4s, v18.4s, v26.4s\n"
    "sqadd v16.4s, v16.4s, v29.4s\n"
    "and v4.16b, v21.16b, v12.16b\n"
    "srshl v15.4s, v15.4s, v19.4s\n"
    "srshl v18.4s, v18.4s, v12.4s\n"
    "srshl v16.4s, v16.4s, v19.4s\n"
    "sshr v4.4s, v4.4s, #0x1f\n"
    "add v15.4s, v15.4s, v10.4s\n"
    "add v18.4s, v18.4s, v10.4s\n"
    "add v16.4s, v16.4s, v10.4s\n"
    "smin v15.4s, v15.4s, v13.4s\n"
    "smin v18.4s, v18.4s, v13.4s\n"
    "smin v16.4s, v16.4s, v13.4s\n"
    "smax v15.4s, v15.4s, v11.4s\n"
    "smax v18.4s, v18.4s, v11.4s\n"
    "smax v16.4s, v16.4s, v11.4s\n"
    "sqadd v21.4s, v21.4s, v4.4s\n"
    "uzp1 v15.16b, v15.16b, v18.16b\n"
    "and v25.16b, v7.16b, v19.16b\n"
    "uzp1 v15.16b, v15.16b, v15.16b\n"
    "srshl v21.4s, v21.4s, v12.4s\n"
    "sshr v25.4s, v25.4s, #0x1f\n"
    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "sqadd v7.4s, v7.4s, v25.4s\n"
    "and v31.16b, v17.16b, v12.16b\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "and v24.16b, v8.16b, v19.16b\n"
    "srshl v7.4s, v7.4s, v19.4s\n"
    "smax v21.4s, v21.4s, v11.4s\n"
    "sshr v31.4s, v31.4s, #0x1f\n"
    "sshr v24.4s, v24.4s, #0x1f\n"
    "uzp1 v16.16b, v16.16b, v21.16b\n"
    "add v7.4s, v7.4s, v10.4s\n"
    "uzp1 v16.16b, v16.16b, v16.16b\n"
    "sqadd v17.4s, v17.4s, v31.4s\n"
    "smin v7.4s, v7.4s, v13.4s\n"
    "sqadd v8.4s, v8.4s, v24.4s\n"
    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
    "smax v7.4s, v7.4s, v11.4s\n"
    "srshl v17.4s, v17.4s, v12.4s\n"
    "srshl v8.4s, v8.4s, v19.4s\n"
    "and v1.16b, v5.16b, v12.16b\n"
    "add v17.4s, v17.4s, v10.4s\n"
    "add v8.4s, v8.4s, v10.4s\n"
    "sshr v1.4s, v1.4s, #0x1f\n"
    "smin v17.4s, v17.4s, v13.4s\n"
    "smin v8.4s, v8.4s, v13.4s\n"
    "sqadd v5.4s, v5.4s, v1.4s\n"
    "smax v17.4s, v17.4s, v11.4s\n"
    "smax v8.4s, v8.4s, v11.4s\n"
    "srshl v5.4s, v5.4s, v12.4s\n"
    "uzp1 v7.16b, v7.16b, v17.16b\n"
    "uzp1 v7.16b, v7.16b, v7.16b\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "smax v5.4s, v5.4s, v11.4s\n"
    "uzp1 v8.16b, v8.16b, v5.16b\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "tbz x4, #2, 121f\n"
    "st1 { v15.s }[0], [x17], #0x4\n"
    "st1 { v16.s }[0], [x16], #0x4\n"
    "st1 { v7.s }[0], [x6], #0x4\n"
    "st1 { v8.s }[0], [x8], #0x4\n"
    "tbz x4, #1, 120f\n"
    "st1 { v15.h }[2], [x17], #0x2\n"
    "st1 { v16.h }[2], [x16], #0x2\n"
    "st1 { v7.h }[2], [x6], #0x2\n"
    "st1 { v8.h }[2], [x8], #0x2\n"
    "tbz x4, #0, 123f\n"
    "st1 { v15.b }[6], [x17], #0x1\n"
    "st1 { v16.b }[6], [x16], #0x1\n"
    "st1 { v7.b }[6], [x6], #0x1\n"
    "st1 { v8.b }[6], [x8], #0x1\n"
    "b 123f\n"
    "120:"  // Oddments: Bit 2: Bit 1: Unset
    "tbz x4, #0, 123f\n"
    "st1 { v15.b }[4], [x17], #0x1\n"
    "st1 { v16.b }[4], [x16], #0x1\n"
    "st1 { v7.b }[4], [x6], #0x1\n"
    "st1 { v8.b }[4], [x8], #0x1\n"
    "b 123f\n"
    "121:"  // Oddments: Bit 2: Unset
    "tbz x4, #1, 122f\n"
    "st1 { v15.h }[0], [x17], #0x2\n"
    "st1 { v16.h }[0], [x16], #0x2\n"
    "st1 { v7.h }[0], [x6], #0x2\n"
    "st1 { v8.h }[0], [x8], #0x2\n"
    "tbz x4, #0, 123f\n"
    "st1 { v15.b }[2], [x17], #0x1\n"
    "st1 { v16.b }[2], [x16], #0x1\n"
    "st1 { v7.b }[2], [x6], #0x1\n"
    "st1 { v8.b }[2], [x8], #0x1\n"
    "b 123f\n"
    "122:"  // Oddments: Bit 2: Unset: Bit 1: Unset
    "tbz x4, #0, 123f\n"
    "st1 { v15.b }[0], [x17], #0x1\n"
    "st1 { v16.b }[0], [x16], #0x1\n"
    "st1 { v7.b }[0], [x6], #0x1\n"
    "st1 { v8.b }[0], [x8], #0x1\n"
    "123:"  // Oddments: Bit 2: End

    "124:"  // End

    :
    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
  );
}

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)
